In [1]:
from __future__ import division, print_function
In [2]:
import numpy as np
import mmd
We're storing the data as a list of n numpy arrays, one per distribution, each of shape (n_samp_i, dim) with dim == 1 here.
In [3]:
n = 500
mean = np.random.normal(0, 10, size=n)
var = np.random.gamma(5, size=n)
n_samp = np.random.randint(10, 500, size=n)
samps = [np.random.normal(m, v, size=s)[:, np.newaxis]
         for m, v, s in zip(mean, var, n_samp)]
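Just to make that layout concrete, here's a purely illustrative sanity check on the shapes:

len(samps), samps[0].shape  # -> (500, (n_samp[0], 1))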
In [4]:
# this gives us a progress bar for MMD computations
from mmd.utils import show_progress
show_progress('mmd.mmd.progress')
In [5]:
# Get the median pairwise squared distance in the aggregate sample,
# as a heuristic for choosing the bandwidth of the inner RBF kernel.
from sklearn.metrics.pairwise import euclidean_distances
sub = np.vstack(samps)
sub = sub[np.random.choice(sub.shape[0], min(1000, sub.shape[0]), replace=False)]
D2 = euclidean_distances(sub, squared=True)
med_2 = np.median(D2[np.triu_indices_from(D2, k=1)], overwrite_input=True)
del sub, D2
In [6]:
from sklearn import cross_validation as cv
from sklearn.kernel_ridge import KernelRidge
import sys
In [7]:
l1_gamma_mults = np.array([1/16, 1/4, 1, 4]) # Could expand these, but it's quicker if you don't. :)
l1_gammas = l1_gamma_mults * med_2
Now we'll get the $\mathrm{MMD}^2$ values for each of the proposed gammas. (Doing all the gammas at once is probably somewhat faster than computing each one independently, but I haven't really tested that; if it causes memory issues or anything, do them separately.)
We also want to save the "diagonal" values (the mean map kernel between a set and itself), so that we can later compute MMDs to test sets without recomputing those.
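For reference, the squared MMD between two sample sets decomposes into mean map kernel values,

$$\mathrm{MMD}^2(X, Y) = \overline{k}(X, X) + \overline{k}(Y, Y) - 2\,\overline{k}(X, Y), \qquad \overline{k}(X, Y) = \frac{1}{|X|\,|Y|} \sum_{x \in X} \sum_{y \in Y} k(x, y)$$

(up to whatever bias-correction details `mmd.rbf_mmd` uses internally). So once the training diagonals $\overline{k}(X_i, X_i)$ are saved, getting the MMD from a new set $Y$ to each $X_i$ only requires the cross terms $\overline{k}(X_i, Y)$ and $\overline{k}(Y, Y)$.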
In [8]:
mmds, mmk_diags = mmd.rbf_mmd(samps, gammas=l1_gammas, squared=True, n_jobs=40, ret_X_diag=True)
Now we want to turn these $\mathrm{MMD}^2$ matrices into kernels and evaluate the regression for each setting of the remaining hyperparameter values.
We'll just do a 3D grid search here.
Ideally, this would be:
In [9]:
# Choose parameters for the hyperparameter search
k_fold = list(cv.KFold(n, n_folds=3, shuffle=True))
l2_gamma_mults = np.array([1/4, 1, 4, 8])
alphas = np.array([1/128, 1/64, 1/16, 1/4, 1, 4])
scores = np.empty((l1_gamma_mults.size, l2_gamma_mults.size, alphas.size, len(k_fold)))
scores.fill(np.nan)
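For each candidate pair $(\gamma_1, \gamma_2)$, the cell below exponentiates the corresponding $\mathrm{MMD}^2$ matrix into a set-level RBF kernel,

$$K_{ij} = \exp\!\left(-\gamma_2 \, \mathrm{MMD}^2_{\gamma_1}(X_i, X_j)\right),$$

and then cross-validates kernel ridge regression over the alphas using that precomputed kernel.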
In [10]:
%%time
K = np.empty((n, n), dtype=samps[0].dtype)
for l1_gamma_i, l1_gamma in enumerate(l1_gamma_mults * med_2):
    print("l1 gamma {} / {}: {:.4}".format(l1_gamma_i + 1, len(l1_gamma_mults), l1_gamma), file=sys.stderr)
    D2_mmd = mmds[l1_gamma_i]
    # get the median of *these* squared distances,
    # to scale the bandwidth of the outer RBF kernel
    mmd_med2 = np.median(D2_mmd[np.triu_indices_from(D2_mmd, k=1)])
    for l2_gamma_i, l2_gamma in enumerate(l2_gamma_mults * mmd_med2):
        print("\tl2 gamma {} / {}: {:.4}".format(l2_gamma_i + 1, len(l2_gamma_mults), l2_gamma), file=sys.stderr)
        np.multiply(D2_mmd, -l2_gamma, out=K)
        np.exp(K, out=K)
        for alpha_i, alpha in enumerate(alphas):
            ridge = KernelRidge(alpha=alpha, kernel='precomputed')
            these = cv.cross_val_score(ridge, K, mean, cv=k_fold)
            scores[l1_gamma_i, l2_gamma_i, alpha_i, :] = these
            print("\t\talpha {} / {}: {} \t {}".format(alpha_i + 1, len(alphas), alpha, these), file=sys.stderr)
In [11]:
mean_scores = scores.mean(axis=-1)
In [12]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
In [13]:
import matplotlib.ticker as ticker
fig = plt.figure(figsize=(8, 15))
for i in range(len(l1_gamma_mults)):
    ax = fig.add_subplot(len(l1_gamma_mults), 1, i + 1)
    cax = ax.matshow(mean_scores[i, :, :], vmin=0, vmax=1)
    ax.set_yticklabels([''] + ['{:.3}'.format(g * mmd_med2) for g in l2_gamma_mults])
    ax.set_xticklabels([''] + ['{:.3}'.format(a) for a in alphas])
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    fig.colorbar(cax)
    ax.set_title("L1 gamma: {}".format(l1_gamma_mults[i] * med_2))
plt.tight_layout()
Looking at these results, the best scores come from the lowest alpha, the lowest L1 gamma, and the highest L2 gamma, i.e. the optimum sits on the edge of the grid we searched. If we were really trying to solve this as well as possible, we'd want to extend the search to lower alphas, lower L1 gammas, and higher L2 gammas. But, whatever; let's just use the best of the settings we did try for now.
In [14]:
best_l1_gamma_i, best_l2_gamma_i, best_alpha_i = np.unravel_index(mean_scores.argmax(), mean_scores.shape)
best_l1_gamma = l1_gamma_mults[best_l1_gamma_i] * med_2
best_l2_gamma = l2_gamma_mults[best_l2_gamma_i] * mmd_med2
best_alpha = alphas[best_alpha_i]
In [16]:
best_l1_gamma_i, best_l2_gamma_i, best_alpha_i
Out[16]:
In [15]:
best_l1_gamma, best_l2_gamma, best_alpha
Out[15]:
Now, train a model on the full training set:
In [17]:
# get the training kernel
D2_mmd = mmds[best_l1_gamma_i]
np.multiply(D2_mmd, -best_l2_gamma, out=K)
np.exp(K, out=K)
ridge = KernelRidge(alpha=best_alpha, kernel='precomputed')
ridge.fit(K, mean)
Out[17]:
To evaluate on new data:
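As a reminder of how prediction works with kernel='precomputed': fitting solves for dual coefficients $\hat{\alpha} = (K + \lambda I)^{-1} y$ (where $\lambda$ is the alpha regularization parameter), and KernelRidge.predict then just needs the test-by-train kernel matrix $K_*$, of shape $t_n \times n$, returning $\hat{y}_* = K_* \hat{\alpha}$. So we have to compute the MMDs from each test set to each training set and push them through the same outer RBF kernel.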
In [18]:
# generate some test data from the same distribution
t_n = 100
t_mean = np.random.normal(0, 10, size=t_n)
t_var = np.random.gamma(5, size=t_n)
t_n_samp = np.random.randint(10, 500, size=t_n)
t_samps = [np.random.normal(m, v, size=s)[:, np.newaxis]
           for m, v, s in zip(t_mean, t_var, t_n_samp)]
In [19]:
# get the kernel from the training data to the test data
t_K = mmd.rbf_mmd(t_samps, samps, gammas=best_l1_gamma, squared=True,
                  Y_diag=mmk_diags[best_l1_gamma_i], n_jobs=20)
t_K *= -best_l2_gamma
np.exp(t_K, out=t_K);
In [20]:
preds = ridge.predict(t_K)
In [21]:
plt.figure(figsize=(5, 5))
plt.scatter(t_mean, preds)
Out[21]:
Pretty good predictions!
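To put a number on that, a quick quantitative check could look like the following (just a sketch, using the preds and t_mean arrays from above):

from sklearn.metrics import mean_squared_error, r2_score
# root-mean-squared error and R^2 of the predicted means on the test sets
print("RMSE: {:.3}".format(np.sqrt(mean_squared_error(t_mean, preds))))
print("R^2:  {:.3}".format(r2_score(t_mean, preds)))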